require(data.table)
require(readstata13)
require(stringi)
require(quanteda)
require(expss)

# Japanese prefecture names.  Element order matters: later in this script the
# position of a prefecture in this vector (via charmatch()) is multiplied by
# 100 to form the prefecture part of a district code.
prefecture.names.jp <- c(
  "北海道", "青森県", "岩手県", "宮城県", "秋田県", "山形県",      #  1- 6
  "福島県", "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県",      #  7-12
  "東京都", "神奈川県", "新潟県", "富山県", "石川県", "福井県",    # 13-18
  "山梨県", "長野県", "岐阜県", "静岡県", "愛知県", "三重県",      # 19-24
  "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県",    # 25-30
  "鳥取県", "島根県", "岡山県", "広島県", "山口県", "徳島県",      # 31-36
  "香川県", "愛媛県", "高知県", "福岡県", "佐賀県", "長崎県",      # 37-42
  "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県"               # 43-47
)

# Romanised prefecture names, listed in the same order as prefecture.names.jp.
prefecture.names.en <- c(
  "Hokkaido", "Aomori", "Iwate", "Miyagi", "Akita", "Yamagata",        #  1- 6
  "Fukushima", "Ibaraki", "Tochigi", "Gunma", "Saitama", "Chiba",      #  7-12
  "Tokyo", "Kanagawa", "Niigata", "Toyama", "Ishikawa", "Fukui",       # 13-18
  "Yamanashi", "Nagano", "Gifu", "Shizuoka", "Aichi", "Mie",           # 19-24
  "Shiga", "Kyoto", "Osaka", "Hyogo", "Nara", "Wakayama",              # 25-30
  "Tottori", "Shimane", "Okayama", "Hiroshima", "Yamaguchi",           # 31-35
  "Tokushima", "Kagawa", "Ehime", "Kochi", "Fukuoka", "Saga",          # 36-41
  "Nagasaki", "Kumamoto", "Oita", "Miyazaki", "Kagoshima", "Okinawa"   # 42-47
)

#### correct the header of Catalinac (2018)'s data ####
# The first row of the CSV holds one file name per document; its first field
# is dropped by the `-1`.  Only that row is needed here, so reading stops
# after one row instead of parsing the whole matrix.
text.name <- as.character(unlist(fread("all.1986-2009.reduced.csv",
                                       header = FALSE,
                                       nrows = 1L)[1, -1]))

# Corrections to the file names, one c(<text to find>, <replacement>) pair per
# row.  They are applied in the order listed.  The strings are literal text,
# not regular expressions: matching them as regexes would let every "." in a
# file name stand for an arbitrary character.
header.corrections <- rbind(
  # entries without "第<n>区": the extraction code further down detects the
  # missing "第" and cuts the prefecture name at "奄美"
  c("第３区ﾄｸﾀﾞﾄﾗｵ徳田トラオ",
    "奄美群島区ﾄｸﾀﾞﾄﾗｵ徳田トラオ"),
  c("第３区ﾔｽｵｶｵｷﾊﾙ保岡オキハル",
    "奄美群島区ﾔｽｵｶｵｷﾊﾙ保岡オキハル"),
  # NOTE(review): unlike the two replacements above, this one has no "区"
  # after "奄美群島".  Kept verbatim; it does not affect the fields parsed in
  # this script -- confirm whether it is intentional.
  c("第３区ｼﾏﾅｶﾞｸﾆｾｷしまなが国積",
    "奄美群島ｼﾏﾅｶﾞｸﾆｾｷしまなが国積"),
  # district number changed
  c("X1986.80.千葉県第２区ﾂｼﾞﾀﾐﾉﾙ辻田実.txt",
    "X1986.80.千葉県第３区ﾂｼﾞﾀﾐﾉﾙ辻田実.txt"),
  c("X1993.695.静岡県第２区ﾔﾅｷﾞｻﾜﾊｸｵ柳沢はくお.txt",
    "X1993.695.静岡県第３区ﾔﾅｷﾞｻﾜﾊｸｵ柳沢はくお.txt"),
  # prefecture name changed from 高地県 to 高知県
  c("X1993.411.高地県第１区ｲｼﾀﾞﾉﾘﾄｼ石田のりとし.txt",
    "X1993.411.高知県第１区ｲｼﾀﾞﾉﾘﾄｼ石田のりとし.txt"),
  c("X1993.412.高地県第１区ｺﾞﾄｳﾏｻﾉﾘ五島正規.txt",
    "X1993.412.高知県第１区ｺﾞﾄｳﾏｻﾉﾘ五島正規.txt"),
  # district number changed
  c("X1996.565.京都府第１区ｵｸﾀﾞﾐｷｵ奥田みきお.txt",
    "X1996.565.京都府第２区ｵｸﾀﾞﾐｷｵ奥田みきお.txt"),
  c("X1996.570.京都府第２区ｵｸﾔﾏｼｹﾞﾋｺおくやま茂彦.txt",
    "X1996.570.京都府第３区ｵｸﾔﾏｼｹﾞﾋｺおくやま茂彦.txt"),
  c("X1996.573.京都府第３区ﾉﾅｶﾋﾛﾑ野中ひろむ.txt",
    "X1996.573.京都府第４区ﾉﾅｶﾋﾛﾑ野中ひろむ.txt"),
  c("X1996.579.京都府第５区ﾔﾏﾉｲｶｽﾞﾉﾘやまのい和則.txt",
    "X1996.579.京都府第６区ﾔﾏﾉｲｶｽﾞﾉﾘやまのい和則.txt"),
  # half-width katakana reading inserted before the name
  c("X2003.33.愛知県第１１区くしだ真吾.txt",
    "X2003.33.愛知県第１１区ｸｼﾀﾞｼﾝｺﾞくしだ真吾.txt"),
  c("X2003.34.愛知県第１１区ふるもと伸一郎.txt",
    "X2003.34.愛知県第１１区ﾌﾙﾓﾄｼﾝｲﾁﾛｳふるもと伸一郎.txt")
)

# vectorize_all = FALSE applies every pair, in order, to every file name
# (the same effect as one replace call per pair).
text.name <- stri_replace_all_fixed(text.name,
                                    header.corrections[, 1],
                                    header.corrections[, 2],
                                    vectorize_all = FALSE)

#### extract candidate information from the header of Catalinac (2018)'s data ####
# The fields are cut out of each file name by position: characters 2-5 are the
# year, the id starts at character 7 and runs up to the next ".", the
# prefecture follows that "." and ends before "第", the district number sits
# between "第" and "区", and the candidate name is whatever follows the last
# run of katakana up to ".txt".

# position of the "." that ends the id, counted from character 7
id.dot <- regexpr("\\.", substr(text.name, 7, nchar(text.name)))
# position of "第" (-1 when the file name has none)
dai.pos <- regexpr("第", text.name)

id <- as.numeric(substr(text.name, 7, id.dot + 5))
year <- as.numeric(substr(text.name, 2, 5))

# Names without "第" are cut at "奄美" instead.
prefecture <- substr(text.name,
                     id.dot + 7,
                     ifelse(dai.pos == -1,
                            regexpr("奄美", text.name) - 1,
                            dai.pos - 1))

# stri_trans_nfkc() turns the full-width digits into ASCII digits so that
# as.numeric() can parse them.
# NOTE(review): names without "第" get the hard-coded district 4 --
# presumably the number the Amami district carries in the Reed-Smith kucode;
# confirm against that data set.
district <- ifelse(dai.pos == -1, 4,
                   as.numeric(stri_trans_nfkc(substr(text.name,
                                                     dai.pos + 1,
                                                     regexpr("区", text.name) - 1))))

# End of the last half-width katakana run; when a file name has none, fall
# back to the end of the last full-width katakana run.
halfwidth.end <- stri_locate_last_regex(text.name, "[ｦ-ﾟ]+")[, 2]
fullwidth.end <- stri_locate_last_regex(text.name, "[ァ-ヴ]+")[, 2]
name <- substr(text.name,
               ifelse(is.na(halfwidth.end), fullwidth.end, halfwidth.end) + 1,
               # the dot is escaped so that only a literal ".txt" ends the name
               regexpr("\\.txt", text.name) - 1)

# stringsAsFactors = FALSE keeps prefecture and name as character vectors on
# every R version (before R 4.0 the default turned them into factors).
candidate.data <- data.frame(id, year, prefecture, district, name,
                             stringsAsFactors = FALSE)

#### combine Catalinac (2018)'s data and Reed-Smith data ####
Reed.Smith.data <- read.dta13("Reed-Smith-JHRED-CANDIDATES.dta")
name.correction.list <- read.csv("name_correction_list.csv", 
                                 fileEncoding = "UTF-8")

# One row per manifesto: the candidate.data columns first, followed by every
# Reed-Smith column.  Everything starts as NA and is filled in below.
candidate.cols <- seq_len(ncol(candidate.data))
reed.smith.cols <- ncol(candidate.data) + seq_len(ncol(Reed.Smith.data))
merged.data <- as.data.frame(matrix(NA, 
                                    nrow(candidate.data), 
                                    ncol(candidate.data) + ncol(Reed.Smith.data)))
merged.data[, candidate.cols] <- candidate.data
colnames(merged.data) <- c(colnames(candidate.data), colnames(Reed.Smith.data))

# District code of each manifesto: 100 * (position of the prefecture in
# prefecture.names.jp) + district number.  Computed once for all rows.
candidate.kucode <- 100 * charmatch(candidate.data$prefecture, prefecture.names.jp) + 
  candidate.data$district

for (i in seq_len(nrow(candidate.data))) {
  # Reed-Smith rows for the same year and district.  `year`, `yr` and `kucode`
  # are looked up in Reed.Smith.data first.
  # NOTE(review): `yr %% 1 == 0` keeps rows whose `yr` is a whole number --
  # presumably this drops by-elections; confirm against the codebook.
  district.subset.data <- subset(Reed.Smith.data, 
                                 year == candidate.data$year[i] & yr %% 1 == 0 & 
                                   kucode == candidate.kucode[i])
  # Manual override of the name parsed from the file name, applied only when
  # the correction list has exactly one entry for this name/year/prefecture.
  correction.list <- subset(name.correction.list, 
                            name == candidate.data$name[i] & 
                              year == as.character(candidate.data$year[i]) & 
                              prefecture == as.character(candidate.data$prefecture[i]))
  if (nrow(correction.list) == 1) {
    # as.character(): if read.csv() produced a factor, assigning it directly
    # would store the factor's integer code instead of the name.
    candidate.data$name[i] <- as.character(correction.list$name_jp)
  }
  # which() ignores rows whose name_jp is NA.  Exactly one match is required;
  # anything else cannot be stored in a single row, so fail with a message
  # that identifies the candidate.
  match.row <- which(district.subset.data$name_jp == candidate.data$name[i])
  if (length(match.row) != 1) {
    stop("row ", i, " (", candidate.data$year[i], ", ", 
         candidate.data$prefecture[i], ", district ", candidate.data$district[i], 
         ", ", candidate.data$name[i], "): expected exactly one match in ", 
         "Reed.Smith.data, found ", length(match.row), 
         call. = FALSE)
  }
  merged.data[i, reed.smith.cols] <- district.subset.data[match.row, ]
}

#### create covariates ####
# Read the matrix once.  After transposing, each row is one document; the
# file's first column supplies the column (feature) names.
reduced.data <- fread("all.1986-2009.reduced.csv", 
                      data.table = FALSE)
dfm.matrix <- t(reduced.data[, -1])
colnames(dfm.matrix) <- reduced.data[, 1]
rm(reduced.data)  # the wide data frame is not needed once transposed

# Column 2 is overwritten with (sum of columns 19-26) / column 5; it is looked
# up later as the "over65" covariate, with column 1 as the lookup key.
# NOTE(review): presumably columns 19-26 are the 65-and-over age bands and
# column 5 the total population -- confirm against the census file layout.
census.1.300 <- read.csv("選挙区別集計_03年齢別×男女別人口・外国人人口.csv", 
                         skip = 6)
census.1.300[, 2] <- rowSums(census.1.300[, 19:26]) / census.1.300[, 5]

# Column 2 is overwritten with (column 6 + column 8) / column 5; it is looked
# up later as the "agriculture" covariate, with column 1 as the lookup key.
# NOTE(review): presumably columns 6 and 8 are agricultural industry counts
# and column 5 total employment -- confirm against the census file layout.
census.2.300 <- read.csv("選挙区別集計_12産業別×男女別15歳以上就業者数.csv", 
                         skip = 5)
census.2.300[, 2] <- rowSums(census.2.300[, c(6, 8)]) / census.2.300[, 5]

# Column 5 holds percentages written with a "%" sign; strip the sign and
# convert to a proportion.  kucode is built as column 2 * 100 + column 4.
# NOTE(review): presumably column 2 is the prefecture code and column 4 the
# district number, matching the kucode used elsewhere -- confirm.
DID.300 <- read.csv("DID_2003.csv", fileEncoding = "UTF-8")
DID.300[, 5] <- as.numeric(gsub("%", "", DID.300[, 5])) / 100
DID.300$kucode <- as.numeric(DID.300[, 2]) * 100 + as.numeric(DID.300[, 4])
# Convert the document-by-feature matrix to a quanteda dfm and attach the
# candidate- and district-level covariates as document variables.
dfm.matrix <- as.dfm(dfm.matrix)
docvars(dfm.matrix, "name") <- merged.data$name_jp
docvars(dfm.matrix, "female") <- merged.data$female
docvars(dfm.matrix, "male") <- 1 - merged.data$female
docvars(dfm.matrix, "age") <- merged.data$age
docvars(dfm.matrix, "incumbent") <- (merged.data$inc > 0) * 1
docvars(dfm.matrix, "year") <- merged.data$year

party.id <- merged.data$party_id
election.year <- merged.data$year

# Party label for each party_id code.  The groups share no code, so the order
# of the list does not affect the result.
party.groups <- list(
  LDP     = c(1, 1.5),
  JSP     = c(2, 2.5),
  Komeito = c(3, 3.5),
  DSP     = c(4, 4.5),
  JCP     = c(5, 5.5),
  NFP     = c(15, 15.5),
  DPJ     = c(16, 16.5),
  left    = c(12, 12.5, 19, 19.5, 26, 30),
  right   = c(11, 13, 13.5, 14, 17, 17.5, 18, 24, 25, 29, 31, 32, 34, 34.5, 38)
)
party.codes <- unlist(party.groups, use.names = FALSE)
party.labels <- rep(names(party.groups), lengths(party.groups))
party.label <- party.labels[match(party.id, party.codes)]
# Codes not listed above become "others"; a missing party_id stays NA.
party.label[is.na(party.label) & !is.na(party.id)] <- "others"
docvars(dfm.matrix, "party") <- party.label

# Governing-party indicator: party 1/1.5 in every year; party 11 in 1986;
# parties 2/2.5/12/12.5 in 1996; party 3/3.5 in years after 1999; party 25 in
# 2000, 2003 and 2005.
in.government <- 
  party.id == 1 | party.id == 1.5 | 
  (party.id == 11 & election.year == 1986) | 
  ((party.id == 2 | party.id == 2.5 | party.id == 12 | party.id == 12.5) & 
     election.year == 1996) | 
  ((party.id == 3 | party.id == 3.5) & election.year > 1999) | 
  (party.id == 25 & 
     (election.year == 2000 | election.year == 2003 | election.year == 2005))
docvars(dfm.matrix, "government") <- in.government * 1

# log(totcwins + 1), after subtracting 1 from totcwins when result > 0.
# NOTE(review): presumably this removes the current election's win from the
# cumulative count -- confirm against the Reed-Smith codebook.
docvars(dfm.matrix, "wins") <- log(merged.data$totcwins - (merged.data$result > 0) * 1 + 1)
# 1 when result is 1 or 2, otherwise 0.
docvars(dfm.matrix, "result") <- (merged.data$result == 1 | merged.data$result == 2) * 1

# One id per year-district combination.
merged.data$district.id <- as.numeric(factor(merged.data$year * 10000 + merged.data$kucode))
# "competing" is 1 for every candidate in a district-year with more than one
# female candidate, 0 otherwise.  ave() returns the per-district sum aligned
# with the rows, so no loop over districts is needed.
female.per.district <- ave(merged.data$female, merged.data$district.id, FUN = sum)
docvars(dfm.matrix, "competing") <- ifelse(female.per.district > 1, 1, 0)

docvars(dfm.matrix, "kucode") <- merged.data$kucode
# District covariates are filled in only for years from 2003 on; earlier
# years get NA.  vlookup() arguments are (value, table, result column,
# lookup column).
docvars(dfm.matrix, "over65") <- ifelse(merged.data$year >= 2003, 
                                        vlookup(merged.data$kucode, census.1.300, 2, 1), NA)
docvars(dfm.matrix, "agriculture") <- ifelse(merged.data$year >= 2003, 
                                             vlookup(merged.data$kucode, census.2.300, 2, 1), NA)
# NOTE(review): lookup column 8 is expected to be the kucode column appended
# to DID.300, which holds only if DID_2003.csv has seven columns -- confirm.
docvars(dfm.matrix, "DID") <- ifelse(merged.data$year >= 2003, 
                                     vlookup(merged.data$kucode, DID.300, 5, 8), NA)

save(dfm.matrix, file = "dfm_matrix.Rdata")